https://www.kaggle.com/datasets/pranav941/hows-that-dog-for-me
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import math
#import dataset
# Load the two Kaggle tables: yearly AKC popularity ranks and breed trait scores.
df_rank, df_trait = (pd.read_csv(name) for name in ("rank.csv", "traits.csv"))

# Structural overview and a preview of the ranking table.
df_rank.info()
print()
df_rank.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 198 entries, 0 to 197 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Breed 198 non-null object 1 2013 Rank 177 non-null float64 2 2014 Rank 184 non-null float64 3 2015 Rank 184 non-null float64 4 2016 Rank 189 non-null float64 5 2017 Rank 190 non-null float64 6 2018 Rank 192 non-null float64 7 2019 Rank 195 non-null float64 8 2020 Rank 195 non-null float64 dtypes: float64(8), object(1) memory usage: 14.0+ KB
| Breed | 2013 Rank | 2014 Rank | 2015 Rank | 2016 Rank | 2017 Rank | 2018 Rank | 2019 Rank | 2020 Rank | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Retrievers (Labrador) | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 1 | German Shepherd Dogs | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 3.0 |
| 2 | Retrievers (Golden) | 3.0 | 3.0 | 3.0 | 3.0 | 3.0 | 3.0 | 3.0 | 4.0 |
| 3 | Beagles | 4.0 | 5.0 | 5.0 | 5.0 | 6.0 | 6.0 | 7.0 | 7.0 |
| 4 | Bulldogs | 5.0 | 4.0 | 4.0 | 4.0 | 5.0 | 5.0 | 5.0 | 5.0 |
# Structural overview of the traits table: 195 breeds, 17 columns
# (14 integer 1-5 scores plus Breed, Coat Type, Coat Length as objects).
df_trait.info()
print()
# Preview the first five breeds' trait scores.
df_trait.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 195 entries, 0 to 194 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Breed 195 non-null object 1 Affectionate With Family 195 non-null int64 2 Good With Young Children 195 non-null int64 3 Good With Other Dogs 195 non-null int64 4 Shedding Level 195 non-null int64 5 Coat Grooming Frequency 195 non-null int64 6 Drooling Level 195 non-null int64 7 Coat Type 195 non-null object 8 Coat Length 195 non-null object 9 Openness To Strangers 195 non-null int64 10 Playfulness Level 195 non-null int64 11 Watchdog/Protective Nature 195 non-null int64 12 Adaptability Level 195 non-null int64 13 Trainability Level 195 non-null int64 14 Energy Level 195 non-null int64 15 Barking Level 195 non-null int64 16 Mental Stimulation Needs 195 non-null int64 dtypes: int64(14), object(3) memory usage: 26.0+ KB
| Breed | Affectionate With Family | Good With Young Children | Good With Other Dogs | Shedding Level | Coat Grooming Frequency | Drooling Level | Coat Type | Coat Length | Openness To Strangers | Playfulness Level | Watchdog/Protective Nature | Adaptability Level | Trainability Level | Energy Level | Barking Level | Mental Stimulation Needs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Retrievers (Labrador) | 5 | 5 | 5 | 4 | 2 | 2 | Double | Short | 5 | 5 | 3 | 5 | 5 | 5 | 3 | 4 |
| 1 | French Bulldogs | 5 | 5 | 4 | 3 | 1 | 3 | Smooth | Short | 5 | 5 | 3 | 5 | 4 | 3 | 1 | 3 |
| 2 | German Shepherd Dogs | 5 | 5 | 3 | 4 | 2 | 2 | Double | Medium | 3 | 4 | 5 | 5 | 5 | 5 | 3 | 5 |
| 3 | Retrievers (Golden) | 5 | 5 | 5 | 4 | 2 | 2 | Double | Medium | 5 | 4 | 3 | 5 | 5 | 3 | 1 | 4 |
| 4 | Bulldogs | 4 | 3 | 3 | 3 | 3 | 3 | Smooth | Short | 4 | 4 | 3 | 3 | 4 | 3 | 2 | 3 |
#remove \xa0 in column
# Collapse every run of whitespace (including the non-breaking \xa0
# characters) in the breed names into single ASCII spaces, so the two
# tables agree on the join key.
df_trait['Breed'] = df_trait['Breed'].map(lambda name: ' '.join(name.split()))

#combine two dataset based on Breed column
# Inner join: only breeds present in both tables survive.
df = df_rank.merge(df_trait, on='Breed', how='inner')
df.head()
| Breed | 2013 Rank | 2014 Rank | 2015 Rank | 2016 Rank | 2017 Rank | 2018 Rank | 2019 Rank | 2020 Rank | Affectionate With Family | ... | Coat Type | Coat Length | Openness To Strangers | Playfulness Level | Watchdog/Protective Nature | Adaptability Level | Trainability Level | Energy Level | Barking Level | Mental Stimulation Needs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Retrievers (Labrador) | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 5 | ... | Double | Short | 5 | 5 | 3 | 5 | 5 | 5 | 3 | 4 |
| 1 | German Shepherd Dogs | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 3.0 | 5 | ... | Double | Medium | 3 | 4 | 5 | 5 | 5 | 5 | 3 | 5 |
| 2 | Retrievers (Golden) | 3.0 | 3.0 | 3.0 | 3.0 | 3.0 | 3.0 | 3.0 | 4.0 | 5 | ... | Double | Medium | 5 | 4 | 3 | 5 | 5 | 3 | 1 | 4 |
| 3 | Beagles | 4.0 | 5.0 | 5.0 | 5.0 | 6.0 | 6.0 | 7.0 | 7.0 | 3 | ... | Smooth | Short | 3 | 4 | 2 | 4 | 3 | 4 | 4 | 4 |
| 4 | Bulldogs | 5.0 | 4.0 | 4.0 | 4.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4 | ... | Smooth | Short | 4 | 4 | 3 | 3 | 4 | 3 | 2 | 3 |
5 rows × 25 columns
#Replace NAN with 0
# Missing yearly ranks become 0 so later code can treat 0 as "unranked".
df = df.fillna(0)

# The 14 numeric trait columns to visualize, split into two charts of 7.
y = ['Affectionate With Family', 'Good With Young Children', 'Good With Other Dogs', 'Shedding Level',
     'Coat Grooming Frequency', 'Drooling Level', 'Openness To Strangers', 'Playfulness Level',
     'Watchdog/Protective Nature', 'Adaptability Level', 'Trainability Level', 'Energy Level', 'Barking Level', 'Mental Stimulation Needs']

# Two grouped bar charts each for the first breed ("Retrievers (Labrador)")
# and the final breed ("Pumik"); same four figures as the original cell.
for breed in ("Retrievers (Labrador)", "Pumik"):
    for cols in (y[0:7], y[7:]):
        fig = px.bar(df[df["Breed"].isin([breed])], y=cols, barmode='group')
        fig.show()
#correlation heatmap between variables
# Pairwise correlations among the trait columns only — everything after
# Breed and the eight yearly rank columns (positions 9 onward).
trait_corr = df.iloc[:, 9:].corr()
sns.heatmap(trait_corr, center=0, annot=False, cmap="PRGn")
plt.show()
# Echo the full column list (notebook cell output).
df.columns
Index(['Breed', '2013 Rank', '2014 Rank', '2015 Rank', '2016 Rank',
'2017 Rank', '2018 Rank', '2019 Rank', '2020 Rank',
'Affectionate With Family', 'Good With Young Children',
'Good With Other Dogs', 'Shedding Level', 'Coat Grooming Frequency',
'Drooling Level', 'Coat Type', 'Coat Length', 'Openness To Strangers',
'Playfulness Level', 'Watchdog/Protective Nature', 'Adaptability Level',
'Trainability Level', 'Energy Level', 'Barking Level',
'Mental Stimulation Needs'],
dtype='object')
#Replace categorical data to numerical
# Encode the two categorical coat columns as integer codes.
# NOTE: the original used Series.replace(..., inplace=True) on a column
# selection — chained assignment, which is deprecated in recent pandas and
# may silently operate on a copy. Assigning the result back is reliable
# and keeps the same mapping.
df['Coat Type'] = df['Coat Type'].replace(
    ['Double', 'Smooth', 'Silky', 'Curly', 'Wiry', 'Wavy', 'Hairless', 'Rough', 'Corded'],
    [0, 1, 2, 3, 4, 5, 6, 7, 8])
df['Coat Length'] = df['Coat Length'].replace(
    ['Short', 'Medium', 'Long'],
    [0, 1, 2])
#Histogram visualization
# One histogram per encoded feature on a 6x3 grid; rows 1 and 6 hold only
# two plots (columns 1 and 3). Driven by a (column, row, col) table instead
# of 16 copy-pasted add_trace calls.
# Fixes the subplot-title typo "Opennes To Strangers" -> "Openness To Strangers".
_hist_layout = [
    ("Affectionate With Family", 1, 1), ("Good With Young Children", 1, 3),
    ("Good With Other Dogs", 2, 1), ("Shedding Level", 2, 2), ("Coat Grooming Frequency", 2, 3),
    ("Drooling Level", 3, 1), ("Coat Type", 3, 2), ("Coat Length", 3, 3),
    ("Openness To Strangers", 4, 1), ("Playfulness Level", 4, 2), ("Watchdog/Protective Nature", 4, 3),
    ("Adaptability Level", 5, 1), ("Trainability Level", 5, 2), ("Energy Level", 5, 3),
    ("Barking Level", 6, 1), ("Mental Stimulation Needs", 6, 3),
]
fig = make_subplots(rows=6, cols=3,
                    specs=[[{"colspan": 1}, None, {"colspan": 1}],
                           [{}, {}, {}],
                           [{}, {}, {}],
                           [{}, {}, {}],
                           [{}, {}, {}],
                           [{"colspan": 1}, None, {"colspan": 1}]],
                    subplot_titles=tuple(name for name, _, _ in _hist_layout))
for name, grid_row, grid_col in _hist_layout:
    fig.add_trace(go.Histogram(x=df[name]), row=grid_row, col=grid_col)
fig.update_layout(height=1000, showlegend=False)
fig.show()
#Create new variable "Average Ranking" based on average of ranking from 2013 to 2020, then appends it to main dataframe
# Mean of the 2013-2020 rank columns (positions 1-8), ignoring the 0s that
# fillna(0) used as a missing-rank placeholder — same values the original
# per-row Python loop produced, but vectorized.
# Robustness fix: the loop divided by its running count and would raise
# ZeroDivisionError for a breed with no rank in any year; this version
# yields NaN for such a breed instead of crashing.
rank_cols = df.iloc[:, 1:9]
df['Average Rank'] = rank_cols.where(rank_cols != 0).mean(axis=1)
df.head()
df.head()
| Breed | 2013 Rank | 2014 Rank | 2015 Rank | 2016 Rank | 2017 Rank | 2018 Rank | 2019 Rank | 2020 Rank | Affectionate With Family | ... | Coat Length | Openness To Strangers | Playfulness Level | Watchdog/Protective Nature | Adaptability Level | Trainability Level | Energy Level | Barking Level | Mental Stimulation Needs | Average Rank | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Retrievers (Labrador) | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 5 | ... | 0 | 5 | 5 | 3 | 5 | 5 | 5 | 3 | 4 | 1.000 |
| 1 | German Shepherd Dogs | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 3.0 | 5 | ... | 1 | 3 | 4 | 5 | 5 | 5 | 5 | 3 | 5 | 2.125 |
| 2 | Retrievers (Golden) | 3.0 | 3.0 | 3.0 | 3.0 | 3.0 | 3.0 | 3.0 | 4.0 | 5 | ... | 1 | 5 | 4 | 3 | 5 | 5 | 3 | 1 | 4 | 3.125 |
| 3 | Beagles | 4.0 | 5.0 | 5.0 | 5.0 | 6.0 | 6.0 | 7.0 | 7.0 | 3 | ... | 0 | 3 | 4 | 2 | 4 | 3 | 4 | 4 | 4 | 5.625 |
| 4 | Bulldogs | 5.0 | 4.0 | 4.0 | 4.0 | 5.0 | 5.0 | 5.0 | 5.0 | 4 | ... | 0 | 4 | 4 | 3 | 3 | 4 | 3 | 2 | 3 | 4.625 |
5 rows × 26 columns
#Splitting 70% train, 30% test
# Features: every trait column (positions 9+) except the target itself;
# target: the derived Average Rank.
X = df.iloc[:, 9:].drop(columns='Average Rank')
y = np.array(df['Average Rank'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15)

#Create RF Regressor Model with 5000 trees
reg = RandomForestRegressor(n_estimators=5000, random_state=140)
#Fitting
reg.fit(X_train, y_train)
RandomForestRegressor(n_estimators=5000, random_state=140)
#Predict
# Score the held-out 30% and plot predicted vs actual average ranks.
y_pred = reg.predict(X_test)
#Predicted vs Actual Visualization
px.line(y = [y_pred, y_test], markers=True, title = "Predicted(var0) vs Actual(var1)", labels={"value": "Rank"})

# Error metrics on the test split (MSE computed once, reused for RMSE).
errors = abs(y_pred - y_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('Average absolute error:', round(np.mean(errors), 2), 'degrees.')
print('R-squared scores:', round(r2, 2))
print("MSE =", round(mse, 2))
print("RMSE =", round(math.sqrt(mse), 2))
Average absolute error: 42.44 degrees. R-squared scores: 0.21 MSE = 2342.18 RMSE = 48.4
R-squared score = 21%, meaning the predictors explain 21% of the variation in the target variable. This result makes the model far from sufficient for use in real-life situations; it should be improved or scrapped entirely.
#Feature selection
# Horizontal bar chart of each feature's importance in the fitted forest
# (trailing semicolon suppresses the notebook echo of the bar container).
plt.barh(y=X.columns.values, width=reg.feature_importances_);
I decided to remove all features with less than 4% importance.
#Splitting 70% train, 30% test after feature selection
# Drop the low-importance features (below 4% importance) plus the target.
# Fix: the original drop list contained 'Mental Stimulation Needs' twice.
X = df.iloc[:, 9:]
X = X.drop(['Mental Stimulation Needs', 'Energy Level', 'Average Rank'], axis=1)
y = np.array(df['Average Rank'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15)
#Create RF Regressor model with 5000 trees
reg=RandomForestRegressor(n_estimators=5000, random_state=140)
#Fitting
reg.fit(X_train, y_train)
RandomForestRegressor(n_estimators=5000, random_state=140)
#Predict
# Score the held-out split again, now with the reduced feature set.
y_pred = reg.predict(X_test)
#Predicted vs Actual Visualization
px.line(y = [y_pred, y_test], markers=True, title = "Predicted(var0) vs Actual(var1)", labels={"value": "Rank"})

# Error metrics after feature selection (MSE computed once, reused for RMSE).
errors = abs(y_pred - y_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('Average absolute error:', round(np.mean(errors), 2), 'degrees.')
print('R-squared scores:', round(r2, 2))
print("MSE =", round(mse, 2))
print("RMSE =", round(math.sqrt(mse), 2))
Average absolute error: 41.86 degrees. R-squared scores: 0.24 MSE = 2268.18 RMSE = 47.63
R-squared score = 24%, meaning the predictors explain 24% of the variation in the target variable. While the result has improved, it is still far below the ideal value of at least 70%.